#Loading Libraries

library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(boot)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmboot)
library(lattice)
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
library(caret)
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(naniar)
library(utils)
library(stats)

##Reading in Dataset

# Locate and load the life-expectancy CSV.
# The directory can be overridden with the LIFE_DATA_DIR environment variable;
# it defaults to the original project folder. Building the path with
# file.path() avoids setwd(), which would change the working directory for
# everything that runs afterwards.
data_dir <- Sys.getenv(
  "LIFE_DATA_DIR",
  unset = "/Users/xaviermojica/Desktop/Stats2/Project1"
)
life <- read.csv(file.path(data_dir, "Life Expectancy Data (1).csv"))

# First look at the raw relationship between GDP and life expectancy.
ggplot(data = life) + geom_point(mapping = aes(x = GDP, y = Life.expectancy))
## Warning: Removed 453 rows containing missing values (`geom_point()`).

##Looking at the scatter plot of the original data set, GDP (the x variable) appears to need a log transformation, since we are interested in the relationship between Life Expectancy and GDP.

## Checking data types

# Compact overview of each column: its type and its first few values.
utils::str(life)
## 'data.frame':    2938 obs. of  22 variables:
##  $ Country                        : chr  "Afghanistan" "Afghanistan" "Afghanistan" "Afghanistan" ...
##  $ Year                           : int  2015 2014 2013 2012 2011 2010 2009 2008 2007 2006 ...
##  $ Status                         : chr  "Developing" "Developing" "Developing" "Developing" ...
##  $ Life.expectancy                : num  65 59.9 59.9 59.5 59.2 58.8 58.6 58.1 57.5 57.3 ...
##  $ Adult.Mortality                : int  263 271 268 272 275 279 281 287 295 295 ...
##  $ infant.deaths                  : int  62 64 66 69 71 74 77 80 82 84 ...
##  $ Alcohol                        : num  0.01 0.01 0.01 0.01 0.01 0.01 0.01 0.03 0.02 0.03 ...
##  $ percentage.expenditure         : num  71.3 73.5 73.2 78.2 7.1 ...
##  $ Hepatitis.B                    : int  65 62 64 67 68 66 63 64 63 64 ...
##  $ Measles                        : int  1154 492 430 2787 3013 1989 2861 1599 1141 1990 ...
##  $ BMI                            : num  19.1 18.6 18.1 17.6 17.2 16.7 16.2 15.7 15.2 14.7 ...
##  $ under.five.deaths              : int  83 86 89 93 97 102 106 110 113 116 ...
##  $ Polio                          : int  6 58 62 67 68 66 63 64 63 58 ...
##  $ Total.expenditure              : num  8.16 8.18 8.13 8.52 7.87 9.2 9.42 8.33 6.73 7.43 ...
##  $ Diphtheria                     : int  65 62 64 67 68 66 63 64 63 58 ...
##  $ HIV.AIDS                       : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 ...
##  $ GDP                            : num  584.3 612.7 631.7 670 63.5 ...
##  $ Population                     : num  33736494 327582 31731688 3696958 2978599 ...
##  $ thinness..1.19.years           : num  17.2 17.5 17.7 17.9 18.2 18.4 18.6 18.8 19 19.2 ...
##  $ thinness.5.9.years             : num  17.3 17.5 17.7 18 18.2 18.4 18.7 18.9 19.1 19.3 ...
##  $ Income.composition.of.resources: num  0.479 0.476 0.47 0.463 0.454 0.448 0.434 0.433 0.415 0.405 ...
##  $ Schooling                      : num  10.1 10 9.9 9.8 9.5 9.2 8.9 8.7 8.4 8.1 ...
# Visualise which cells of the raw data are missing.
vis_miss(life)

# Number of rows and columns in the raw data.
dim(life)
## [1] 2938   22
# Open the data viewer only in interactive sessions: View() needs a data
# viewer / GUI and can fail when the script is run non-interactively
# (for example through Rscript).
if (interactive()) {
  View(life)
}

# Median imputation

# GDP (column 17) is about 15% missing but is kept and imputed: it is assumed
# to be crucial for predicting Life.expectancy, since richer countries tend to
# have better access to health care, medicine and technology. The values
# appear to be GDP per capita (GDP / Population), so GDP and Population would
# be closely related and would probably introduce collinearity.

# Fit the median imputer on every column except 1:4 (Country, Year, Status and
# the response Life.expectancy) and 9 (Hepatitis.B).
imputeMedian <- preProcess(
  life[, -c(1:4, 9)],
  method = "medianImpute"
)
# predict() is then called on the complete data frame.
cleandataMedian <- predict(imputeMedian, newdata = life)
dim(cleandataMedian)
## [1] 2938   22
# Rotate the x-axis labels of the missingness plot so column names are legible.
vis_miss(cleandataMedian) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))

# The literature suggests that more than 10% missing data can contribute to
# bias. Hepatitis.B (column 9) is 19% missing and Population (column 18) is
# 22% missing, so both columns are removed.
cleandataMedian <- cleandataMedian[, -c(9, 18)]
vis_miss(cleandataMedian) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))

# Drop the rows that still contain an NA.
cleandataMedian <- na.omit(cleandataMedian)
vis_miss(cleandataMedian) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))

# GDP against life expectancy on the cleaned data.
ggplot(cleandataMedian, aes(x = GDP, y = Life.expectancy)) +
  geom_point()

# Append log-transformed versions of GDP and of the response. The new columns
# are added in this order: logGDP, then logLife.expectancy.
cleandataMedian[["logGDP"]] <- log(cleandataMedian[["GDP"]])
cleandataMedian[["logLife.expectancy"]] <-
  log(cleandataMedian[["Life.expectancy"]])

# Scatter plot with both axes on the log scale.
ggplot(cleandataMedian, aes(x = logGDP, y = logLife.expectancy)) +
  geom_point()

# Imputing and removing

# Second variant: fit the median imputer on every column except 1:4
# (Country, Year, Status and the response Life.expectancy), i.e. nothing else
# is held out this time. Note that this overwrites imputeMedian.
imputeMedian <- preProcess(
  life[, -(1:4)],
  method = "medianImpute"
)
cleandataMedian1 <- predict(imputeMedian, newdata = life)
dim(cleandataMedian1)
## [1] 2938   22
vis_miss(cleandataMedian1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))

# Drop the remaining rows with an NA (less than 0.1% of cells).
cleandataMedian1 <- na.omit(cleandataMedian1)
vis_miss(cleandataMedian1) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0))

dim(cleandataMedian1)
## [1] 2928   22

#Multivariable Plots

library(ISLR)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(GGally)
library(ggplot2)

# Pairs plot of columns 5 to 22 of cleandataMedian, i.e. every column after
# Life.expectancy, including the appended logGDP and logLife.expectancy.
ggpairs(
  cleandataMedian[, 5:22],
  lower = list(
    continuous = wrap("points", color = "red", alpha = 0.5),
    combo = wrap("box", color = "orange", alpha = 0.3),
    discrete = wrap("facetbar", color = "yellow", alpha = 0.3)
  ),
  diag = list(
    continuous = wrap("densityDiag", color = "blue", alpha = 0.5)
  )
)

#ggpairs(cleandataMedian[,5:22], upper = list(continuous = wrap("cor", size = 4.75, align_percent = 1)))
#ggscatmat(cleandataMedian, columns = 5:22)
# GDP against life expectancy on the fully imputed data, raw scale.
ggplot(cleandataMedian1, aes(x = GDP, y = Life.expectancy)) +
  geom_point()

# Same plot with GDP log-transformed on the fly.
ggplot(cleandataMedian1, aes(x = log(GDP), y = Life.expectancy)) +
  geom_point()

# Chosen multiple linear regression model

# Note: despite its name, eightVar is fitted with nine predictors.
eightVar <- lm(
  Life.expectancy ~ HIV.AIDS + Schooling + Alcohol + BMI + Polio +
    Diphtheria + logGDP + thinness..1.19.years +
    Income.composition.of.resources,
  data = cleandataMedian
)
summary(eightVar)
## 
## Call:
## lm(formula = Life.expectancy ~ HIV.AIDS + Schooling + Alcohol + 
##     BMI + Polio + Diphtheria + logGDP + thinness..1.19.years + 
##     Income.composition.of.resources, data = cleandataMedian)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -27.4403  -2.5682   0.0843   2.7047  18.8786 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     42.994510   0.557803  77.078  < 2e-16 ***
## HIV.AIDS                        -0.678646   0.017783 -38.164  < 2e-16 ***
## Schooling                        0.804939   0.048463  16.609  < 2e-16 ***
## Alcohol                          0.055757   0.026443   2.109   0.0351 *  
## BMI                              0.054950   0.005583   9.842  < 2e-16 ***
## Polio                            0.035731   0.005047   7.079 1.81e-12 ***
## Diphtheria                       0.045743   0.005000   9.148  < 2e-16 ***
## logGDP                           0.616229   0.063235   9.745  < 2e-16 ***
## thinness..1.19.years            -0.120226   0.024449  -4.917 9.26e-07 ***
## Income.composition.of.resources  7.369419   0.717204  10.275  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.634 on 2918 degrees of freedom
## Multiple R-squared:  0.7639, Adjusted R-squared:  0.7632 
## F-statistic:  1049 on 9 and 2918 DF,  p-value: < 2.2e-16
# 95% confidence intervals for the coefficients of the chosen model.
confint(eightVar, level = 0.95)
##                                        2.5 %      97.5 %
## (Intercept)                     41.900781759 44.08823816
## HIV.AIDS                        -0.713513984 -0.64377855
## Schooling                        0.709914467  0.89996366
## Alcohol                          0.003908609  0.10760635
## BMI                              0.044002477  0.06589743
## Polio                            0.025833590  0.04562761
## Diphtheria                       0.035938319  0.05554730
## logGDP                           0.492238370  0.74021949
## thinness..1.19.years            -0.168165244 -0.07228603
## Income.composition.of.resources  5.963142186  8.77569544
# Standard lm diagnostic plots (residuals vs fitted, Q-Q, scale-location,
# residuals vs leverage).
plot(eightVar, which = c(1, 2, 3, 5))

#Forward, Backward, Stepwise Selection

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Full model: every candidate predictor. It is the starting point of the
# backward and both-direction searches and the upper bound of the forward
# search.
set.seed(1246)
fitFull <- lm(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling +
    logGDP,
  data = cleandataMedian
)

# Forward selection has to start from a small model and be told how far it may
# grow. Calling stepAIC(fitFull, direction = "forward") without a scope has
# nothing left to add, so it returns fitFull unchanged. Start from the
# intercept-only model and allow terms up to the full model's formula.
# NOTE: the console output recorded in the comments after this call, and
# under summary(stepup) further down, was captured from the earlier no-scope
# call; re-run the script to refresh it.
fitNull <- lm(Life.expectancy ~ 1, data = cleandataMedian)
stepup <- stepAIC(
  fitNull,
  scope = list(lower = formula(fitNull), upper = formula(fitFull)),
  direction = "forward",
  steps = 2000
)
## Start:  AIC=8219.89
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP
# Backward elimination by AIC, starting from the full model.
stepdown <- stepAIC(
  fitFull,
  direction = "backward",
  steps = 2000
)
## Start:  AIC=8219.89
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## - thinness.5.9.years               1       2.3 47947 8218.0
## <none>                                         47945 8219.9
## - thinness..1.19.years             1      72.4 48017 8222.3
## - Total.expenditure                1      96.8 48041 8223.8
## - Measles                          1     139.7 48084 8226.4
## - Alcohol                          1     286.9 48231 8235.4
## - percentage.expenditure           1     329.8 48274 8238.0
## - Polio                            1     617.7 48562 8255.4
## - logGDP                           1     723.7 48668 8261.8
## - Diphtheria                       1    1005.8 48950 8278.7
## - BMI                              1    1054.2 48999 8281.6
## - Income.composition.of.resources  1    1155.8 49100 8287.6
## - infant.deaths                    1    2421.5 50366 8362.2
## - under.five.deaths                1    2445.4 50390 8363.5
## - Schooling                        1    3913.0 51858 8447.6
## - Adult.Mortality                  1   10813.5 58758 8813.4
## - HIV.AIDS                         1   11560.1 59505 8850.4
## 
## Step:  AIC=8218.03
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## <none>                                         47947 8218.0
## - Total.expenditure                1      95.5 48042 8221.9
## - Measles                          1     141.0 48088 8224.6
## - thinness..1.19.years             1     229.8 48177 8230.0
## - Alcohol                          1     285.9 48233 8233.4
## - percentage.expenditure           1     330.1 48277 8236.1
## - Polio                            1     616.7 48564 8253.4
## - logGDP                           1     721.4 48668 8259.8
## - Diphtheria                       1    1008.9 48956 8277.0
## - BMI                              1    1056.2 49003 8279.8
## - Income.composition.of.resources  1    1156.5 49103 8285.8
## - infant.deaths                    1    2438.7 50386 8361.3
## - under.five.deaths                1    2457.7 50405 8362.4
## - Schooling                        1    3919.7 51867 8446.1
## - Adult.Mortality                  1   10813.7 58761 8811.5
## - HIV.AIDS                         1   11559.5 59506 8848.4
# Stepwise search by AIC (terms may be dropped and re-added), starting from
# the full model.
stepboth <- stepAIC(
  fitFull,
  direction = "both",
  steps = 2000
)
## Start:  AIC=8219.89
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## - thinness.5.9.years               1       2.3 47947 8218.0
## <none>                                         47945 8219.9
## - thinness..1.19.years             1      72.4 48017 8222.3
## - Total.expenditure                1      96.8 48041 8223.8
## - Measles                          1     139.7 48084 8226.4
## - Alcohol                          1     286.9 48231 8235.4
## - percentage.expenditure           1     329.8 48274 8238.0
## - Polio                            1     617.7 48562 8255.4
## - logGDP                           1     723.7 48668 8261.8
## - Diphtheria                       1    1005.8 48950 8278.7
## - BMI                              1    1054.2 48999 8281.6
## - Income.composition.of.resources  1    1155.8 49100 8287.6
## - infant.deaths                    1    2421.5 50366 8362.2
## - under.five.deaths                1    2445.4 50390 8363.5
## - Schooling                        1    3913.0 51858 8447.6
## - Adult.Mortality                  1   10813.5 58758 8813.4
## - HIV.AIDS                         1   11560.1 59505 8850.4
## 
## Step:  AIC=8218.03
## Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol + 
##     percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP
## 
##                                   Df Sum of Sq   RSS    AIC
## <none>                                         47947 8218.0
## + thinness.5.9.years               1       2.3 47945 8219.9
## - Total.expenditure                1      95.5 48042 8221.9
## - Measles                          1     141.0 48088 8224.6
## - thinness..1.19.years             1     229.8 48177 8230.0
## - Alcohol                          1     285.9 48233 8233.4
## - percentage.expenditure           1     330.1 48277 8236.1
## - Polio                            1     616.7 48564 8253.4
## - logGDP                           1     721.4 48668 8259.8
## - Diphtheria                       1    1008.9 48956 8277.0
## - BMI                              1    1056.2 49003 8279.8
## - Income.composition.of.resources  1    1156.5 49103 8285.8
## - infant.deaths                    1    2438.7 50386 8361.3
## - under.five.deaths                1    2457.7 50405 8362.4
## - Schooling                        1    3919.7 51867 8446.1
## - Adult.Mortality                  1   10813.7 58761 8811.5
## - HIV.AIDS                         1   11559.5 59506 8848.4
# Summaries of the selected models, starting with the forward-selection result.
up <- summary(stepup)
print(up)
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP, data = cleandataMedian)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2860  -2.1523   0.0363   2.3535  15.6953 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.182e+01  6.254e-01  82.858  < 2e-16 ***
## Adult.Mortality                 -2.026e-02  7.907e-04 -25.623  < 2e-16 ***
## infant.deaths                    1.004e-01  8.282e-03  12.125  < 2e-16 ***
## Alcohol                          1.007e-01  2.412e-02   4.173 3.09e-05 ***
## percentage.expenditure           2.117e-04  4.730e-05   4.475 7.94e-06 ***
## Measles                         -2.229e-05  7.656e-06  -2.912  0.00362 ** 
## BMI                              3.993e-02  4.990e-03   8.000 1.77e-15 ***
## under.five.deaths               -7.461e-02  6.123e-03 -12.185  < 2e-16 ***
## Polio                            2.718e-02  4.439e-03   6.124 1.03e-09 ***
## Total.expenditure                8.267e-02  3.409e-02   2.425  0.01538 *  
## Diphtheria                       3.461e-02  4.428e-03   7.815 7.64e-15 ***
## HIV.AIDS                        -4.658e-01  1.758e-02 -26.493  < 2e-16 ***
## thinness..1.19.years            -1.059e-01  5.051e-02  -2.096  0.03616 *  
## thinness.5.9.years               1.869e-02  4.979e-02   0.375  0.70745    
## Income.composition.of.resources  5.333e+00  6.367e-01   8.377  < 2e-16 ***
## Schooling                        6.609e-01  4.288e-02  15.414  < 2e-16 ***
## logGDP                           4.139e-01  6.244e-02   6.629 4.02e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.058 on 2911 degrees of freedom
## Multiple R-squared:  0.8194, Adjusted R-squared:  0.8184 
## F-statistic: 825.5 on 16 and 2911 DF,  p-value: < 2.2e-16
# Summary of the backward-elimination result.
down <- summary(stepdown)
print(down)
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP, data = cleandataMedian)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2742  -2.1492   0.0321   2.3594  15.6889 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.185e+01  6.220e-01  83.361  < 2e-16 ***
## Adult.Mortality                 -2.025e-02  7.902e-04 -25.627  < 2e-16 ***
## infant.deaths                    1.006e-01  8.266e-03  12.170  < 2e-16 ***
## Alcohol                          1.005e-01  2.411e-02   4.167 3.17e-05 ***
## percentage.expenditure           2.118e-04  4.730e-05   4.477 7.85e-06 ***
## Measles                         -2.239e-05  7.651e-06  -2.926  0.00346 ** 
## BMI                              3.972e-02  4.959e-03   8.009 1.66e-15 ***
## under.five.deaths               -7.472e-02  6.116e-03 -12.217  < 2e-16 ***
## Polio                            2.716e-02  4.437e-03   6.120 1.06e-09 ***
## Total.expenditure                8.195e-02  3.403e-02   2.408  0.01611 *  
## Diphtheria                       3.465e-02  4.426e-03   7.828 6.90e-15 ***
## HIV.AIDS                        -4.657e-01  1.757e-02 -26.496  < 2e-16 ***
## thinness..1.19.years            -8.917e-02  2.387e-02  -3.736  0.00019 ***
## Income.composition.of.resources  5.335e+00  6.366e-01   8.381  < 2e-16 ***
## Schooling                        6.613e-01  4.286e-02  15.429  < 2e-16 ***
## logGDP                           4.125e-01  6.231e-02   6.619 4.29e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.058 on 2912 degrees of freedom
## Multiple R-squared:  0.8194, Adjusted R-squared:  0.8185 
## F-statistic: 880.8 on 15 and 2912 DF,  p-value: < 2.2e-16
# Summary of the both-direction stepwise result.
both <- summary(stepboth)
print(both)
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     Income.composition.of.resources + Schooling + logGDP, data = cleandataMedian)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2742  -2.1492   0.0321   2.3594  15.6889 
## 
## Coefficients:
##                                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      5.185e+01  6.220e-01  83.361  < 2e-16 ***
## Adult.Mortality                 -2.025e-02  7.902e-04 -25.627  < 2e-16 ***
## infant.deaths                    1.006e-01  8.266e-03  12.170  < 2e-16 ***
## Alcohol                          1.005e-01  2.411e-02   4.167 3.17e-05 ***
## percentage.expenditure           2.118e-04  4.730e-05   4.477 7.85e-06 ***
## Measles                         -2.239e-05  7.651e-06  -2.926  0.00346 ** 
## BMI                              3.972e-02  4.959e-03   8.009 1.66e-15 ***
## under.five.deaths               -7.472e-02  6.116e-03 -12.217  < 2e-16 ***
## Polio                            2.716e-02  4.437e-03   6.120 1.06e-09 ***
## Total.expenditure                8.195e-02  3.403e-02   2.408  0.01611 *  
## Diphtheria                       3.465e-02  4.426e-03   7.828 6.90e-15 ***
## HIV.AIDS                        -4.657e-01  1.757e-02 -26.496  < 2e-16 ***
## thinness..1.19.years            -8.917e-02  2.387e-02  -3.736  0.00019 ***
## Income.composition.of.resources  5.335e+00  6.366e-01   8.381  < 2e-16 ***
## Schooling                        6.613e-01  4.286e-02  15.429  < 2e-16 ***
## logGDP                           4.125e-01  6.231e-02   6.619 4.29e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.058 on 2912 degrees of freedom
## Multiple R-squared:  0.8194, Adjusted R-squared:  0.8185 
## F-statistic: 880.8 on 15 and 2912 DF,  p-value: < 2.2e-16
# Show the call and coefficients of the full model for comparison.
print(fitFull)
## 
## Call:
## lm(formula = Life.expectancy ~ Adult.Mortality + infant.deaths + 
##     Alcohol + percentage.expenditure + Measles + BMI + under.five.deaths + 
##     Polio + Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years + 
##     thinness.5.9.years + Income.composition.of.resources + Schooling + 
##     logGDP, data = cleandataMedian)
## 
## Coefficients:
##                     (Intercept)                  Adult.Mortality  
##                       5.182e+01                       -2.026e-02  
##                   infant.deaths                          Alcohol  
##                       1.004e-01                        1.007e-01  
##          percentage.expenditure                          Measles  
##                       2.117e-04                       -2.229e-05  
##                             BMI                under.five.deaths  
##                       3.993e-02                       -7.461e-02  
##                           Polio                Total.expenditure  
##                       2.718e-02                        8.267e-02  
##                      Diphtheria                         HIV.AIDS  
##                       3.461e-02                       -4.658e-01  
##            thinness..1.19.years               thinness.5.9.years  
##                      -1.059e-01                        1.869e-02  
## Income.composition.of.resources                        Schooling  
##                       5.333e+00                        6.609e-01  
##                          logGDP  
##                       4.139e-01
# Cross-check: AIC-based forward selection as implemented in olsrr, using the
# full model to define the candidate predictors.
olsrr::ols_step_forward_aic(model = fitFull)
## 
##                                        Selection Summary                                         
## ------------------------------------------------------------------------------------------------
## Variable                              AIC         Sum Sq         RSS         R-Sq      Adj. R-Sq 
## ------------------------------------------------------------------------------------------------
## Schooling                          19396.624    136605.379    128885.381    0.51454      0.51437 
## Adult.Mortality                    18025.496    184853.664     80637.096    0.69627      0.69606 
## HIV.AIDS                           17468.139    198876.292     66614.468    0.74909      0.74883 
## Diphtheria                         17162.353    205523.215     59967.545    0.77413      0.77382 
## BMI                                16998.736    208821.027     56669.733    0.78655      0.78618 
## logGDP                             16874.136    211219.069     54271.690    0.79558      0.79516 
## Income.composition.of.resources    16792.318    212750.648     52740.112    0.80135      0.80087 
## Polio                              16749.619    213549.676     51941.083    0.80436      0.80382 
## thinness..1.19.years               16718.844    214127.843     51362.917    0.80654      0.80594 
## percentage.expenditure             16699.760    214496.365     50994.395    0.80792      0.80727 
## Measles                            16682.530    214830.184     50660.576    0.80918      0.80846 
## Total.expenditure                  16675.783    214981.292     50509.468    0.80975      0.80897 
## Alcohol                            16671.712    215085.919     50404.841    0.81014      0.80930 
## ------------------------------------------------------------------------------------------------

# k-nearest neighbours regression

# Resampling scheme: 10-fold cross-validation, repeated once.
fit_cont <- trainControl(method = "repeatedcv", number = 10, repeats = 1)
# Seed set right before train() so the resampling is reproducible.
set.seed(136)

# Tune k over 1..30 using the same 16 predictors as the full linear model.
# NOTE(review): the predictors are passed to kNN unscaled (the recorded output
# reports "No pre-processing"), so variables with large numeric ranges may
# dominate the distance. Consider preProcess = c("center", "scale") - this
# would change the recorded results, so it is left as is here.
knnfit <- train(
  Life.expectancy ~ Adult.Mortality + infant.deaths + Alcohol +
    percentage.expenditure + Measles + BMI + under.five.deaths + Polio +
    Total.expenditure + Diphtheria + HIV.AIDS + thinness..1.19.years +
    thinness.5.9.years + Income.composition.of.resources + Schooling +
    logGDP,
  data = cleandataMedian,
  method = "knn",
  trControl = fit_cont,
  tuneGrid = expand.grid(k = 1:30)
)

knnfit
## k-Nearest Neighbors 
## 
## 2928 samples
##   16 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times) 
## Summary of sample sizes: 2636, 2635, 2633, 2636, 2635, 2636, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE      Rsquared   MAE     
##    1  5.488143  0.6935591  3.449756
##    2  4.921379  0.7397969  3.185675
##    3  4.896706  0.7389821  3.174618
##    4  4.905568  0.7367200  3.191503
##    5  4.889514  0.7374605  3.204195
##    6  4.936552  0.7319917  3.237000
##    7  4.973915  0.7277472  3.266920
##    8  4.985925  0.7263364  3.289079
##    9  4.980177  0.7272703  3.295459
##   10  4.986556  0.7263713  3.308424
##   11  5.006264  0.7245177  3.326198
##   12  5.027220  0.7222950  3.341819
##   13  5.046430  0.7202132  3.371293
##   14  5.072814  0.7173954  3.393572
##   15  5.117980  0.7123708  3.425087
##   16  5.141204  0.7100030  3.448251
##   17  5.169575  0.7067867  3.476042
##   18  5.210799  0.7022013  3.500357
##   19  5.225452  0.7006864  3.512528
##   20  5.227905  0.7006563  3.519559
##   21  5.245156  0.6989557  3.544645
##   22  5.268221  0.6963622  3.559881
##   23  5.277116  0.6954608  3.575229
##   24  5.291620  0.6939448  3.591443
##   25  5.304875  0.6926038  3.603206
##   26  5.327564  0.6901494  3.622863
##   27  5.350114  0.6876450  3.639980
##   28  5.362781  0.6865690  3.652476
##   29  5.376651  0.6851425  3.664897
##   30  5.397276  0.6828929  3.684026
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 5.

#World Map, Color plotting

#Creating the World 
#library(ggplot2)
#library(tidyverse)
#library(ggthemes)

#world_map = map_data("world") %>% filter(! long > 180)

#countries = world_map %>% distinct(region) %>% rowid_to_column()

#countries %>% ggplot(aes(fill = rowid, map_id = region)) + geom_map(map = world_map) + expand_limits(x = world_map$long, y = world_map$lat) + coord_map("moll") +theme_map()

#Color world plotting

library(ggplot2)
library(tidyverse)

# Copy of the cleaned data used for the maps. The world map data is joined on
# a column called "region", so the first column (Country) is renamed to match.
dataforcolmap <- cleandataMedian
colnames(dataforcolmap)[1] <- "region"
view(dataforcolmap)


# Outline data for the world map.
mapdata <- map_data("world")
view(mapdata)

# Attach the country data to the map outlines.
# dataforcolmap holds one row per country AND year, so joining it directly is
# a many-to-many join: every map row is repeated once per year, dplyr warns,
# and the fill scales of the maps are trained on all years at once. Keep only
# the most recent year available for each country and declare the expected
# many-to-one relationship so any accidental fan-out becomes an error.
mapdata <- dplyr::left_join(
  mapdata,
  dplyr::slice_max(
    dataforcolmap,
    Year,
    n = 1,
    with_ties = FALSE,
    by = region
  ),
  by = "region",
  relationship = "many-to-one"
)
view(mapdata)

# Keep only map rows that found a match in the country data, once per
# variable that is going to be mapped.

# Life expectancy
mapdata1 <- dplyr::filter(mapdata, !is.na(Life.expectancy))

# Status (Developed / Developing)
mapdata2 <- dplyr::filter(mapdata, !is.na(Status))

# Income composition of resources
mapdata3 <- dplyr::filter(mapdata, !is.na(Income.composition.of.resources))


# World map coloured by life expectancy (red = low, yellow = high).
# Axis text, ticks and titles are blanked because longitude/latitude labels
# add nothing to the map.
map1 <- ggplot(mapdata1, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Life.expectancy), color = "black") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  ggtitle("Life Expectancy per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Life Expectancy"))
map1

# World map coloured by development status (Developed vs. Developing).
# The outline colour is a fixed value, so it is passed as a plain argument;
# the former `col = "orange"` inside aes() was a constant placed in a mapping
# and was overridden by color = "black" anyway.
mapStatus <- ggplot(mapdata2, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Status), color = "black") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  ggtitle("Country's Status: Developed v. Developing")
mapStatus

# World map coloured by income composition of resources
# (red = low, yellow = high), with axis decorations removed.
mapIncome <- ggplot(mapdata3, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = Income.composition.of.resources), color = "black") +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank()
  ) +
  ggtitle("Income Composition of Resources per Country") +
  scale_fill_gradient(low = "red", high = "yellow") +
  guides(fill = guide_legend(title = "Income Composition of Resources"))
mapIncome

# Using rpart library
#treeimb <- rpart(ExplVar ~ ., data = train)
#pred.treeimb <- predict(treeimb, newdata = test)